In [6]:
# Author: Stephen Situ
# This is a practice project on cleaning data using panda dataframes in python on jupyter
# The original dataset can be found here: https://www.kaggle.com/datasets/gagandeep16/car-sales
In [ ]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib
In [8]:
# Read Data
car_sales_data = pd.read_csv("Car_sales.csv")
In [9]:
# Shape of Data
car_sales_data.shape
Out[9]:
(157, 16)
In [10]:
# Variable Types
car_sales_data.info
Out[10]:
<bound method DataFrame.info of     Manufacturer    Model  Sales_in_thousands  __year_resale_value  \
0          Acura  Integra              16.919               16.360   
1          Acura       TL              39.384               19.875   
2          Acura       CL              14.114               18.225   
3          Acura       RL               8.588               29.725   
4           Audi       A4              20.397               22.255   
..           ...      ...                 ...                  ...   
152        Volvo      V40               3.545                  NaN   
153        Volvo      S70              15.245                  NaN   
154        Volvo      V70              17.531                  NaN   
155        Volvo      C70               3.493                  NaN   
156        Volvo      S80              18.969                  NaN   

    Vehicle_type  Price_in_thousands  Engine_size  Horsepower  Wheelbase  \
0      Passenger               21.50          1.8       140.0      101.2   
1      Passenger               28.40          3.2       225.0      108.1   
2      Passenger                 NaN          3.2       225.0      106.9   
3      Passenger               42.00          3.5       210.0      114.6   
4      Passenger               23.99          1.8       150.0      102.6   
..           ...                 ...          ...         ...        ...   
152    Passenger               24.40          1.9       160.0      100.5   
153    Passenger               27.50          2.4       168.0      104.9   
154    Passenger               28.80          2.4       168.0      104.9   
155    Passenger               45.50          2.3       236.0      104.9   
156    Passenger               36.00          2.9       201.0      109.9   

     Width  Length  Curb_weight  Fuel_capacity  Fuel_efficiency Latest_Launch  \
0     67.3   172.4        2.639           13.2             28.0      2/2/2012   
1     70.3   192.9        3.517           17.2             25.0      6/3/2011   
2     70.6   192.0        3.470           17.2             26.0      1/4/2012   
3     71.4   196.6        3.850           18.0             22.0     3/10/2011   
4     68.2   178.0        2.998           16.4             27.0     10/8/2011   
..     ...     ...          ...            ...              ...           ...   
152   67.6   176.6        3.042           15.8             25.0     9/21/2011   
153   69.3   185.9        3.208           17.9             25.0    11/24/2012   
154   69.3   186.2        3.259           17.9             25.0     6/25/2011   
155   71.5   185.7        3.601           18.5             23.0     4/26/2011   
156   72.1   189.8        3.600           21.1             24.0    11/14/2011   

     Power_perf_factor  
0            58.280150  
1            91.370778  
2                  NaN  
3            91.389779  
4            62.777639  
..                 ...  
152          66.498812  
153          70.654495  
154          71.155978  
155         101.623357  
156          85.735655  

[157 rows x 16 columns]>
In [14]:
# DataFrame Heads
car_sales_data.head(5)
Out[14]:
Manufacturer Model Sales_in_thousands __year_resale_value Vehicle_type Price_in_thousands Engine_size Horsepower Wheelbase Width Length Curb_weight Fuel_capacity Fuel_efficiency Latest_Launch Power_perf_factor
0 Acura Integra 16.919 16.360 Passenger 21.50 1.8 140.0 101.2 67.3 172.4 2.639 13.2 28.0 2/2/2012 58.280150
1 Acura TL 39.384 19.875 Passenger 28.40 3.2 225.0 108.1 70.3 192.9 3.517 17.2 25.0 6/3/2011 91.370778
2 Acura CL 14.114 18.225 Passenger NaN 3.2 225.0 106.9 70.6 192.0 3.470 17.2 26.0 1/4/2012 NaN
3 Acura RL 8.588 29.725 Passenger 42.00 3.5 210.0 114.6 71.4 196.6 3.850 18.0 22.0 3/10/2011 91.389779
4 Audi A4 20.397 22.255 Passenger 23.99 1.8 150.0 102.6 68.2 178.0 2.998 16.4 27.0 10/8/2011 62.777639
In [15]:
# DataFrame Tails
car_sales_data.tail(5)
Out[15]:
Manufacturer Model Sales_in_thousands __year_resale_value Vehicle_type Price_in_thousands Engine_size Horsepower Wheelbase Width Length Curb_weight Fuel_capacity Fuel_efficiency Latest_Launch Power_perf_factor
152 Volvo V40 3.545 NaN Passenger 24.4 1.9 160.0 100.5 67.6 176.6 3.042 15.8 25.0 9/21/2011 66.498812
153 Volvo S70 15.245 NaN Passenger 27.5 2.4 168.0 104.9 69.3 185.9 3.208 17.9 25.0 11/24/2012 70.654495
154 Volvo V70 17.531 NaN Passenger 28.8 2.4 168.0 104.9 69.3 186.2 3.259 17.9 25.0 6/25/2011 71.155978
155 Volvo C70 3.493 NaN Passenger 45.5 2.3 236.0 104.9 71.5 185.7 3.601 18.5 23.0 4/26/2011 101.623357
156 Volvo S80 18.969 NaN Passenger 36.0 2.9 201.0 109.9 72.1 189.8 3.600 21.1 24.0 11/14/2011 85.735655
In [16]:
# Summary / Descriptive Statistics
car_sales_data.describe
Out[16]:
<bound method NDFrame.describe of     Manufacturer    Model  Sales_in_thousands  __year_resale_value  \
0          Acura  Integra              16.919               16.360   
1          Acura       TL              39.384               19.875   
2          Acura       CL              14.114               18.225   
3          Acura       RL               8.588               29.725   
4           Audi       A4              20.397               22.255   
..           ...      ...                 ...                  ...   
152        Volvo      V40               3.545                  NaN   
153        Volvo      S70              15.245                  NaN   
154        Volvo      V70              17.531                  NaN   
155        Volvo      C70               3.493                  NaN   
156        Volvo      S80              18.969                  NaN   

    Vehicle_type  Price_in_thousands  Engine_size  Horsepower  Wheelbase  \
0      Passenger               21.50          1.8       140.0      101.2   
1      Passenger               28.40          3.2       225.0      108.1   
2      Passenger                 NaN          3.2       225.0      106.9   
3      Passenger               42.00          3.5       210.0      114.6   
4      Passenger               23.99          1.8       150.0      102.6   
..           ...                 ...          ...         ...        ...   
152    Passenger               24.40          1.9       160.0      100.5   
153    Passenger               27.50          2.4       168.0      104.9   
154    Passenger               28.80          2.4       168.0      104.9   
155    Passenger               45.50          2.3       236.0      104.9   
156    Passenger               36.00          2.9       201.0      109.9   

     Width  Length  Curb_weight  Fuel_capacity  Fuel_efficiency Latest_Launch  \
0     67.3   172.4        2.639           13.2             28.0      2/2/2012   
1     70.3   192.9        3.517           17.2             25.0      6/3/2011   
2     70.6   192.0        3.470           17.2             26.0      1/4/2012   
3     71.4   196.6        3.850           18.0             22.0     3/10/2011   
4     68.2   178.0        2.998           16.4             27.0     10/8/2011   
..     ...     ...          ...            ...              ...           ...   
152   67.6   176.6        3.042           15.8             25.0     9/21/2011   
153   69.3   185.9        3.208           17.9             25.0    11/24/2012   
154   69.3   186.2        3.259           17.9             25.0     6/25/2011   
155   71.5   185.7        3.601           18.5             23.0     4/26/2011   
156   72.1   189.8        3.600           21.1             24.0    11/14/2011   

     Power_perf_factor  
0            58.280150  
1            91.370778  
2                  NaN  
3            91.389779  
4            62.777639  
..                 ...  
152          66.498812  
153          70.654495  
154          71.155978  
155         101.623357  
156          85.735655  

[157 rows x 16 columns]>
In [18]:
# Check Data Types
car_sales_data.dtypes
Out[18]:
Manufacturer            object
Model                   object
Sales_in_thousands     float64
__year_resale_value    float64
Vehicle_type            object
Price_in_thousands     float64
Engine_size            float64
Horsepower             float64
Wheelbase              float64
Width                  float64
Length                 float64
Curb_weight            float64
Fuel_capacity          float64
Fuel_efficiency        float64
Latest_Launch           object
Power_perf_factor      float64
dtype: object
In [22]:
# Create dataframe of only categorical columns
categorical = car_sales_data.dtypes[car_sales_data.dtypes == "object"].index
print(categorical)
car_sales_data[categorical]
Index(['Manufacturer', 'Model', 'Vehicle_type', 'Latest_Launch'], dtype='object')
Out[22]:
Manufacturer Model Vehicle_type Latest_Launch
0 Acura Integra Passenger 2/2/2012
1 Acura TL Passenger 6/3/2011
2 Acura CL Passenger 1/4/2012
3 Acura RL Passenger 3/10/2011
4 Audi A4 Passenger 10/8/2011
... ... ... ... ...
152 Volvo V40 Passenger 9/21/2011
153 Volvo S70 Passenger 11/24/2012
154 Volvo V70 Passenger 6/25/2011
155 Volvo C70 Passenger 4/26/2011
156 Volvo S80 Passenger 11/14/2011

157 rows × 4 columns

In [23]:
# Sorting the first 15 items in Model
sorted(car_sales_data["Model"])[0:15]
Out[23]:
['3-Sep',
 '3000GT',
 '300M',
 '323i',
 '328i',
 '4Runner',
 '5-Sep',
 '528i',
 'A4',
 'A6',
 'A8',
 'Accent',
 'Accord',
 'Alero',
 'Altima']
In [25]:
# Describe only Manufacterer column
car_sales_data["Manufacturer"].describe
Out[25]:
<bound method NDFrame.describe of 0      Acura
1      Acura
2      Acura
3      Acura
4       Audi
       ...  
152    Volvo
153    Volvo
154    Volvo
155    Volvo
156    Volvo
Name: Manufacturer, Length: 157, dtype: object>
In [30]:
# Check Unique Values in Model
car_sales_data["Model"].unique()
Out[30]:
array(['Integra', 'TL', 'CL', 'RL', 'A4', 'A6', 'A8', '323i', '328i',
       '528i', 'Century', 'Regal', 'Park Avenue', 'LeSabre', 'DeVille',
       'Seville', 'Eldorado', 'Catera', 'Escalade', 'Cavalier', 'Malibu',
       'Lumina', 'Monte Carlo', 'Camaro', 'Corvette', 'Prizm', 'Metro',
       'Impala', 'Sebring Coupe', 'Sebring Conv.', 'Concorde', 'Cirrus',
       'LHS', 'Town & Country', '300M', 'Neon', 'Avenger', 'Stratus',
       'Intrepid', 'Viper', 'Ram Pickup', 'Ram Wagon', 'Ram Van',
       'Dakota', 'Durango', 'Caravan', 'Escort', 'Mustang', 'Contour',
       'Taurus', 'Focus', 'Crown Victoria', 'Explorer', 'Windstar',
       'Expedition', 'Ranger', 'F-Series', 'Civic', 'Accord', 'CR-V',
       'Passport', 'Odyssey', 'Accent', 'Elantra', 'Sonata', 'I30',
       'S-Type', 'Wrangler', 'Cherokee', 'Grand Cherokee', 'ES300',
       'GS300', 'GS400', 'LS400', 'LX470', 'RX300', 'Continental',
       'Town car', 'Navigator', 'Mirage', 'Eclipse', 'Galant', 'Diamante',
       '3000GT', 'Montero', 'Montero Sport', 'Mystique', 'Cougar',
       'Sable', 'Grand Marquis', 'Mountaineer', 'Villager', 'C-Class',
       'E-Class', 'S-Class', 'SL-Class', 'SLK', 'SLK230', 'CLK Coupe',
       'CL500', 'M-Class', 'Sentra', 'Altima', 'Maxima', 'Quest',
       'Pathfinder', 'Xterra', 'Frontier', 'Cutlass', 'Intrigue', 'Alero',
       'Aurora', 'Bravada', 'Silhouette', 'Breeze', 'Voyager', 'Prowler',
       'Sunfire', 'Grand Am', 'Firebird', 'Grand Prix', 'Bonneville',
       'Montana', 'Boxter', 'Carrera Coupe', 'Carrera Cabrio', '5-Sep',
       '3-Sep', 'SL', 'SC', 'SW', 'LW', 'LS', 'Outback', 'Forester',
       'Corolla', 'Camry', 'Avalon', 'Celica', 'Tacoma', 'Sienna', 'RAV4',
       '4Runner', 'Land Cruiser', 'Golf', 'Jetta', 'Passat', 'Cabrio',
       'GTI', 'Beetle', 'S40', 'V40', 'S70', 'V70', 'C70', 'S80'],
      dtype=object)
In [89]:
# Find Nulls 
nan_rows  = car_sales_data[car_sales_data.isna().any(axis=1)]
print(nan_rows)
nan_rows.index
    Manufacturer           Model  Sales_in_thousands  __year_resale_value  \
2          Acura              CL              14.114               18.225   
7            BMW            323i              19.747                  NaN   
15      Cadillac         Seville              15.943               27.100   
18      Cadillac        Escalade              14.785                  NaN   
27     Chevrolet          Impala             107.995                  NaN   
33      Chrysler  Town & Country              53.480               19.540   
34      Chrysler            300M              30.696                  NaN   
38         Dodge        Intrepid              88.028               12.275   
44         Dodge         Durango             101.323                  NaN   
50          Ford           Focus             175.670                  NaN   
66        Jaguar          S-Type              15.467                  NaN   
72         Lexus           GS400               3.334                  NaN   
74         Lexus           LX470               9.126                  NaN   
75         Lexus           RX300              51.238                  NaN   
78       Lincoln       Navigator              22.925                  NaN   
96    Mercedes-B             SLK               7.998                  NaN   
97    Mercedes-B          SLK230               1.526                  NaN   
98    Mercedes-B       CLK Coupe              11.592                  NaN   
99    Mercedes-B           CL500               0.954                  NaN   
100   Mercedes-B         M-Class              28.976                  NaN   
106       Nissan          Xterra              54.158                  NaN   
107       Nissan        Frontier              65.005                  NaN   
109   Oldsmobile        Intrigue              38.554                  NaN   
110   Oldsmobile           Alero              80.255                  NaN   
117     Plymouth         Prowler               1.872                  NaN   
123      Pontiac         Montana              39.572                  NaN   
127         Saab           5-Sep               9.191                  NaN   
128         Saab           3-Sep              12.115                  NaN   
132       Saturn              LW               8.472                  NaN   
133       Saturn              LS              49.989                  NaN   
134       Subaru         Outback              47.107                  NaN   
135       Subaru        Forester              33.028                  NaN   
141       Toyota          Sienna              65.119                  NaN   
150   Volkswagen          Beetle              49.463                  NaN   
151        Volvo             S40              16.957                  NaN   
152        Volvo             V40               3.545                  NaN   
153        Volvo             S70              15.245                  NaN   
154        Volvo             V70              17.531                  NaN   
155        Volvo             C70               3.493                  NaN   
156        Volvo             S80              18.969                  NaN   

    Vehicle_type  Price_in_thousands  Engine_size  Horsepower  Wheelbase  \
2      Passenger                 NaN          3.2       225.0      106.9   
7      Passenger              26.990          2.5       170.0      107.3   
15     Passenger              44.475          4.6       275.0      112.2   
18           Car              46.225          5.7       255.0      117.5   
27     Passenger              18.890          3.4       180.0      110.5   
33           Car                 NaN          NaN         NaN        NaN   
34     Passenger              29.185          3.5       253.0      113.0   
38     Passenger              22.505          2.7       202.0      113.0   
44           Car              26.310          5.2       230.0      115.7   
50     Passenger              12.315          2.0       107.0      103.0   
66     Passenger              42.800          3.0       240.0      114.5   
72     Passenger              46.305          4.0       300.0      110.2   
74           Car              60.105          4.7       230.0      112.2   
75           Car              34.605          3.0       220.0      103.0   
78           Car              42.660          5.4       300.0      119.0   
96     Passenger              38.900          2.3       190.0       94.5   
97     Passenger              41.000          2.3       185.0       94.5   
98     Passenger              41.600          3.2       215.0      105.9   
99     Passenger              85.500          5.0       302.0      113.6   
100          Car              35.300          3.2       215.0      111.0   
106          Car              22.799          3.3       170.0      104.3   
107          Car              17.890          3.3       170.0      116.1   
109    Passenger              24.150          3.5       215.0      109.0   
110    Passenger              18.270          2.4       150.0      107.0   
117    Passenger              43.000          3.5       253.0      113.3   
123          Car              25.635          3.4       185.0      120.0   
127    Passenger              33.120          2.3       170.0      106.4   
128    Passenger              26.100          2.0       185.0      102.6   
132    Passenger              18.835          2.2       137.0      106.5   
133    Passenger              15.010          2.2       137.0      106.5   
134    Passenger              22.695          2.5       165.0      103.5   
135          Car              20.095          2.5       165.0       99.4   
141          Car              22.368          3.0       194.0      114.2   
150    Passenger              15.900          2.0       115.0       98.9   
151    Passenger              23.400          1.9       160.0      100.5   
152    Passenger              24.400          1.9       160.0      100.5   
153    Passenger              27.500          2.4       168.0      104.9   
154    Passenger              28.800          2.4       168.0      104.9   
155    Passenger              45.500          2.3       236.0      104.9   
156    Passenger              36.000          2.9       201.0      109.9   

     Width  Length  Curb_weight  Fuel_capacity  Fuel_efficiency Latest_Launch  \
2     70.6   192.0        3.470           17.2             26.0      1/4/2012   
7     68.4   176.0        3.179           16.6             26.0     6/28/2011   
15    75.0   201.0          NaN           18.5             22.0     4/29/2011   
18    77.0   201.2        5.572           30.0             15.0     4/17/2012   
27    73.0   200.0        3.389           17.0             27.0     6/18/2011   
33     NaN     NaN          NaN            NaN              NaN     7/13/2011   
34    74.4   197.8        3.567           17.0             23.0     2/10/2012   
38    74.7   203.7        3.489           17.0              NaN      6/2/2012   
44    71.7   193.5        4.394           25.0             17.0     6/27/2012   
50    66.9   174.8        2.564           13.2             30.0     7/22/2012   
66    71.6   191.3        3.650           18.4             21.0     11/3/2012   
72    70.9   189.2        3.693           19.8             21.0    11/28/2012   
74    76.4   192.5        5.401           25.4             15.0    10/30/2012   
75    71.5   180.1        3.900           17.2             21.0      1/4/2012   
78    79.9   204.8        5.393           30.0             15.0    12/23/2012   
96    67.5   157.9        3.055           15.9             26.0     1/16/2011   
97    67.5   157.3        2.975           14.0             27.0      8/6/2011   
98    67.8   180.3        3.213           16.4             26.0      7/8/2011   
99    73.1   196.6        4.115           23.2             20.0     4/11/2011   
100   72.2   180.6        4.387           19.0             20.0     2/10/2011   
106   70.4   178.0        3.821           19.4             18.0     1/24/2011   
107   66.5   196.1        3.217           19.4             18.0     8/27/2011   
109   73.6   195.9        3.455           18.0              NaN      4/1/2011   
110   70.1   186.7        2.958           15.0             27.0    10/20/2009   
117   76.3   165.4        2.850           12.0             21.0     6/27/2012   
123   72.7   201.3        3.942           25.0             23.0     7/22/2012   
127   70.6   189.2        3.280           18.5             23.0     11/9/2012   
128   67.4   182.2        2.990           16.9             23.0     6/12/2011   
132   69.0   190.4        3.075           13.1             27.0      8/5/2011   
133   69.0   190.4        2.910           13.1             28.0     12/4/2012   
134   67.5   185.8        3.415           16.9             25.0      7/7/2011   
135   68.3   175.2        3.125           15.9             24.0     9/10/2012   
141   73.4   193.5        3.759           20.9             22.0     10/5/2012   
150   67.9   161.1        2.769           14.5             26.0    10/20/2011   
151   67.6   176.6        2.998           15.8             25.0     2/18/2011   
152   67.6   176.6        3.042           15.8             25.0     9/21/2011   
153   69.3   185.9        3.208           17.9             25.0    11/24/2012   
154   69.3   186.2        3.259           17.9             25.0     6/25/2011   
155   71.5   185.7        3.601           18.5             23.0     4/26/2011   
156   72.1   189.8        3.600           21.1             24.0    11/14/2011   

     Power_perf_factor  
2                  NaN  
7            71.191207  
15          115.621358  
18          109.509117  
27           71.838039  
33                 NaN  
34          101.655244  
38           80.831470  
44           92.854125  
50           43.117132  
66          102.178985  
72          125.013357  
74          105.760458  
75           91.943802  
78          123.972047  
96           82.807362  
97           81.848969  
98           92.925792  
99          141.100985  
100          90.495532  
106          69.782944  
107          67.889271  
109          86.272523  
110          60.727447  
117         106.984456  
123          76.208440  
127          73.503778  
128          76.023048  
132          56.295243  
133          54.819728  
134          67.765908  
135          66.762943  
141          78.027219  
150          47.329632  
151          66.113057  
152          66.498812  
153          70.654495  
154          71.155978  
155         101.623357  
156          85.735655  
Out[89]:
Int64Index([  2,   7,  15,  18,  27,  33,  34,  38,  44,  50,  66,  72,  74,
             75,  78,  96,  97,  98,  99, 100, 106, 107, 109, 110, 117, 123,
            127, 128, 132, 133, 134, 135, 141, 150, 151, 152, 153, 154, 155,
            156],
           dtype='int64')
In [98]:
# Remove any rows with Nulls
car_sales_clean = car_sales_data.drop(labels=nan_rows.index,axis=0,inplace=False)
In [99]:
# Find Duplicates
duplicate_rows = car_sales_clean[car_sales_clean.duplicated(keep='last')]
print(duplicate_rows)
Empty DataFrame
Columns: [Manufacturer, Model, Sales_in_thousands, __year_resale_value, Vehicle_type, Price_in_thousands, Engine_size, Horsepower, Wheelbase, Width, Length, Curb_weight, Fuel_capacity, Fuel_efficiency, Latest_Launch, Power_perf_factor]
Index: []
In [101]:
# Find Null in a column 
car_sales_clean[car_sales_clean[['Horsepower']].isna().any(axis=1)]
Out[101]:
Manufacturer Model Sales_in_thousands __year_resale_value Vehicle_type Price_in_thousands Engine_size Horsepower Wheelbase Width Length Curb_weight Fuel_capacity Fuel_efficiency Latest_Launch Power_perf_factor
In [102]:
# Group by Manufacturer / Model and Sales
car_sales_clean.groupby(['Manufacturer','Model'])['Sales_in_thousands'].sum()
Out[102]:
Manufacturer  Model  
Acura         Integra    16.919
              RL          8.588
              TL         39.384
Audi          A4         20.397
              A6         18.780
                          ...  
Volkswagen    Cabrio      9.569
              GTI         5.596
              Golf        9.761
              Jetta      83.721
              Passat     51.102
Name: Sales_in_thousands, Length: 117, dtype: float64
In [97]:
# Save Cleaned Csv
car_sales_clean.to_csv('car_sales_clean.csv')